global input "Q:\dc1prhcmsas01\PU2\data - sas"
global temp "Q:\dc1prhcmsas01\PU2\temp_stata_wr"
global log "Q:\dc1prhcmsas01\PU2\Log - Stata WR"
global output "Q:\dc1prhcmsas01\PU2\data_stata_wr"

cd "Q:\dc1prhcmsas01\PU2\Code - Stata WR"

capture log close
log using "$log/2_clean_wr", replace

*Clean one month of data
cap program drop clean
program define clean 
args year month
	
	*Import data
	if `year'<=2020 | (`year'==2021 & `month'<=10) {
		if `year'==2013{
			if `month'<=11 {
				import sas CLIENT_CODE YR_MONTH EMP_PUR_C EMP_STAT_C ADDR_ST_C PAY_FREQ_C STD_RT_TYPE_C STD_PAY_RT_A GEN_C ERN2_A HRS2_Q GROSS_ERN_A TOT_HRS_Q PRI_MRTL_STAT_C ///
		MED_SPCL_COMP_A DNT_SPCL_COMP_A HCC_SPCL_COMP_A DK_SPCL_COMP_A DH_SPCL_COMP_A DF_SPCL_COMP_A DX_SPCL_COMP_A DY_SPCL_COMP_A DZ_SPCL_COMP_A DP_SPCL_COMP_A ///
		DS_SPCL_COMP_A DW_SPCL_COMP_A using "$input\ap_`year'`month'.sas7bdat", case(lower) clear
			}
			else{
				import sas client_code YR_MONTH EMP_PUR_C EMP_STAT_C ADDR_ST_C PAY_FREQ_C STD_RT_TYPE_C STD_PAY_RT_A GEN_C ERN2_A HRS2_Q GROSS_ERN_A TOT_HRS_Q PRI_MRTL_STAT_C ///
		MED_SPCL_COMP_A DNT_SPCL_COMP_A HCC_SPCL_COMP_A DK_SPCL_COMP_A DH_SPCL_COMP_A DF_SPCL_COMP_A DX_SPCL_COMP_A DY_SPCL_COMP_A DZ_SPCL_COMP_A DP_SPCL_COMP_A ///
		DS_SPCL_COMP_A DW_SPCL_COMP_A using "$input\ap_`year'`month'.sas7bdat", case(lower) clear
			}
			
		*Retire and health 
		gen health = round(med_spcl_comp_a+dnt_spcl_comp_a+hcc_spcl_comp_a)
		replace health=0 if health==.
		
		gen retire= round(dk_spcl_comp_a+ dh_spcl_comp_a+ df_spcl_comp_a+ dx_spcl_comp_a+ dy_spcl_comp_a+ dz_spcl_comp_a+ dp_spcl_comp_a+ ds_spcl_comp_a+ dw_spcl_comp_a)
		replace retire=0 if retire==.
		}
		else if `year'==2015 & `month'<=10 & `month'!=4 {
		import sas CLIENT_CODE YR_MONTH EMP_PUR_C EMP_STAT_C ADDR_ST_C PAY_FREQ_C STD_RT_TYPE_C STD_PAY_RT_A GEN_C ERN2_A HRS2_Q GROSS_ERN_A TOT_HRS_Q health retire PRI_MRTL_STAT_C using "$input\ap_`year'`month'.sas7bdat", case(lower) clear
		}
		else {
		import sas client_code YR_MONTH EMP_PUR_C EMP_STAT_C ADDR_ST_C PAY_FREQ_C STD_RT_TYPE_C STD_PAY_RT_A GEN_C ERN2_A HRS2_Q GROSS_ERN_A TOT_HRS_Q health retire PRI_MRTL_STAT_C using "$input\ap_`year'`month'.sas7bdat", case(lower) clear
		}
	
	*Clean
		keep if emp_stat_c!="T"
		rename addr_st_c state
		
	*Define salaried
		gen salaried=std_rt_type_c=="S"
		
	*Define base salary
		gen base=.
		replace base=round(std_pay/2,0.01) if std_rt_type_c=="S" & pay_freq=="2"
		replace base=round(std_pay,0.01) if std_rt_type_c=="S" & pay_freq=="4"
		replace base=round(std_pay,0.01) if std_rt_type_c=="S" & pay_freq=="5"
		replace base=round(std_pay/2,0.01) if std_rt_type_c=="S" & pay_freq=="B"
		replace base=round(std_pay*12/52,0.01) if std_rt_type_c=="S" & pay_freq=="M"
		replace base=round(std_pay*24/52,0.01) if std_rt_type_c=="S" & pay_freq=="S"
		replace base=round(std_pay,0.01) if std_rt_type_c=="S" & pay_freq=="W"
		replace base=round(std_pay*40,0.01) if std_rt_type_c=="H"
		
	*Merge on number of paychecks per month
		replace pay_freq="B" if pay_freq=="2"
		replace pay_freq="W" if pay_freq=="4" | pay_freq=="5"
		
		merge m:1 client_code yr_month pay_freq using "$output/number_paychecks"
		keep if _merge==3 | _merge==1
		drop _merge

	*Define average weekly pay from monthly pay
		replace ern2_a=0 if ern2_a==.
		gen ot=0
		replace ot=ern2_a if ern2_a/(hrs2_q*base/40)<=2.5
		
		rename gross gross
		rename tot_hrs_q hours
		
		foreach var of varlist ot gross hours retire health {
		    replace `var'=`var'/(2*num_pay_med) if pay_freq=="B"
			replace `var'=`var'*12/52 if pay_freq=="M" | pay_freq=="S"
			replace `var'=`var'/(num_pay_med) if pay_freq=="W"
		}
		
	*Define wage bill
		gen bill=base+ot
	
	*Gender
		gen male=gen_c=="M"
		gen married = pri_mrtl_stat_c=="M"
	
	*Keep key variables
	keep yr_month state client_code emp_pur_c salaried base ot gross male hours retire health bill married
	
	*Collapse multiple entries per state-firm-person in a month (about 0.6% of sample) - sometimes salaried and hourly in same month (about 0.11% of sample)
	count
	collapse (mean) salaried male married (sum) base ot gross hours retire health bill, by(yr_month state client_code emp_pur_c)
	drop if salaried!=1 & salaried!=0
	count
				
	*Merge client id and naics
		merge m:1 client_code using "$output/client_crosswalk"
		keep if _merge==3
		drop _merge
		
	*Save worker level info
	compress
	save "$temp\cleaned_`year'`month'", replace
	}
end

*Clean every month
forvalues year=2013/2020 {
    forvalues month=1/12 {
		if `month'<10 {
			clean `year' 0`month'
		}
		else {
			clean `year' `month'
		}
	}
}

log close
